Creating new columns converted from the measure per 100 g to the measure per serving.


In [2]:
import pandas as pd

In [3]:
#Load dataset
data = pd.read_csv("demo_food_data.csv", sep = ",")

In [4]:
for c in data.columns:
    print(c)


Unnamed: 0
code
creator
product_name
generic_name
quantity
brands
brands_tags
categories
categories_tags
serving_size
energy_100g
fat_100g
cholesterol_100g
carbohydrates_100g
sugars_100g
starch_100g
fiber_100g
proteins_100g
salt_100g
sodium_100g
alcohol_100g
folates_100g
bicarbonate_100g
potassium_100g
chloride_100g
calcium_100g
iron_100g
fluoride_100g
iodine_100g
caffeine_100g
cocoa_100g
demo_cat

In [14]:
# Take a quick look
data.head()


Out[14]:
Unnamed: 0 code creator product_name generic_name quantity brands brands_tags categories categories_tags ... bicarbonate_100g potassium_100g chloride_100g calcium_100g iron_100g fluoride_100g iodine_100g caffeine_100g cocoa_100g demo_cat
0 1816 11110008817 usda-ndb-import Maple Glazed Cake Donuts NaN NaN Kroger, The Kroger Co. kroger,the-kroger-co NaN NaN ... NaN NaN NaN 0.113 0.00203 NaN NaN NaN NaN donut
1 2271 11110198990 usda-ndb-import Cake Donuts, S'Mores NaN NaN Bakery Fresh Goodness, The Kroger Co. bakery-fresh-goodness,the-kroger-co NaN NaN ... NaN NaN NaN 0.065 0.00232 NaN NaN NaN NaN donut
2 5599 11150146098 usda-ndb-import Kids Graham Crackers, Chocolate NaN NaN Roundy's roundy-s NaN NaN ... NaN NaN NaN 0.000 0.00360 NaN NaN NaN NaN cracker
3 6793 11153041949 usda-ndb-import Crunchy Oat Squares With Cinnamon Cereal NaN NaN Foodtown, Foodtown Inc. foodtown,foodtown-inc NaN NaN ... NaN 0.438 NaN 0.312 0.02250 NaN NaN NaN NaN cereal
4 9226 11225127700 usda-ndb-import Sweet P's Bake Shop, Jumbo Glazed Raised Donuts NaN NaN Kingston Marketing Co kingston-marketing-co NaN NaN ... NaN NaN NaN 0.103 0.00138 NaN NaN NaN NaN donut

5 rows × 33 columns

Now going to go through and find items that have certain category words in the product name. Then filter these to exclude the most often word that is confused in there (e.g. donut flavor coffee gets picked up under donut).

Then going to sort each of these based on the rank of items on key factors like sugar. And for each factor, going to pick items that are at specified percentiles, so we get a wide range on those factors.


In [5]:
#Define the columns we want to convert
original_names = ['energy_100g', 'fat_100g', 'cholesterol_100g','carbohydrates_100g','sugars_100g','starch_100g','fiber_100g',
              'proteins_100g','salt_100g','sodium_100g','alcohol_100g','folates_100g','bicarbonate_100g',
'potassium_100g','chloride_100g','calcium_100g','iron_100g','fluoride_100g','iodine_100g','caffeine_100g','cocoa_100g']

#Create new variable names
new_names=[]
for i in range(len(original_names)):
    new_names.append(original_names[i].split("_")[0])
    
#Parse the serving size and calculate the factor to convert 100 g to serving g
f = lambda x: (x["serving_size"].split(" g")[0])
data["serv_size"] = data.apply(f, axis=1)

f = lambda x: (float(x["serv_size"])/100)
data["serv_factor"] = data.apply(f, axis=1)


#For each target column, create a new column and calculate measure per serving
f = lambda x: (float(x["energy_100g"])*float(x["serv_factor"]))
data["energy"] = data.apply(f, axis=1)

f = lambda x: (float(x["fat_100g"])*float(x["serv_factor"]))
data["fat"] = data.apply(f, axis=1)

f = lambda x: (float(x["cholesterol_100g"])*float(x["serv_factor"]))
data["cholesterol"] = data.apply(f, axis=1)

f = lambda x: (float(x["carbohydrates_100g"])*float(x["serv_factor"]))
data["carbohydrates"] = data.apply(f, axis=1)

f = lambda x: (float(x["sugars_100g"])*float(x["serv_factor"]))
data["sugars"] = data.apply(f, axis=1)

f = lambda x: (float(x["starch_100g"])*float(x["serv_factor"]))
data["starch"] = data.apply(f, axis=1)

f = lambda x: (float(x["fiber_100g"])*float(x["serv_factor"]))
data["fiber"] = data.apply(f, axis=1)

f = lambda x: (float(x["proteins_100g"])*float(x["serv_factor"]))
data["proteins"] = data.apply(f, axis=1)

f = lambda x: (float(x["salt_100g"])*float(x["serv_factor"]))
data["salt"] = data.apply(f, axis=1)

f = lambda x: (float(x["sodium_100g"])*float(x["serv_factor"]))
data["sodium"] = data.apply(f, axis=1)

f = lambda x: (float(x["alcohol_100g"])*float(x["serv_factor"]))
data["alcohol"] = data.apply(f, axis=1)

f = lambda x: (float(x["folates_100g"])*float(x["serv_factor"]))
data["folates"] = data.apply(f, axis=1)

f = lambda x: (float(x["bicarbonate_100g"])*float(x["serv_factor"]))
data["bicarbonate"] = data.apply(f, axis=1)

f = lambda x: (float(x["potassium_100g"])*float(x["serv_factor"]))
data["potassium"] = data.apply(f, axis=1)

f = lambda x: (float(x["chloride_100g"])*float(x["serv_factor"]))
data["chloride"] = data.apply(f, axis=1)

f = lambda x: (float(x["calcium_100g"])*float(x["serv_factor"]))
data["calcium"] = data.apply(f, axis=1)

f = lambda x: (float(x["iron_100g"])*float(x["serv_factor"]))
data["iron"] = data.apply(f, axis=1)

f = lambda x: (float(x["fluoride_100g"])*float(x["serv_factor"]))
data["fluoride"] = data.apply(f, axis=1)

f = lambda x: (float(x["iodine_100g"])*float(x["serv_factor"]))
data["iodine"] = data.apply(f, axis=1)

f = lambda x: (float(x["caffeine_100g"])*float(x["serv_factor"]))
data["caffeine"] = data.apply(f, axis=1)

f = lambda x: (float(x["cocoa_100g"])*float(x["serv_factor"]))
data["cocoa"] = data.apply(f, axis=1)

In [6]:
# Take a look at what we built
data.head()


Out[6]:
Unnamed: 0 code creator product_name generic_name quantity brands brands_tags categories categories_tags ... folates bicarbonate potassium chloride calcium iron fluoride iodine caffeine cocoa
0 1816 11110008817 usda-ndb-import Maple Glazed Cake Donuts NaN NaN Kroger, The Kroger Co. kroger,the-kroger-co NaN NaN ... NaN NaN NaN NaN 0.08023 0.001441 NaN NaN NaN NaN
1 2271 11110198990 usda-ndb-import Cake Donuts, S'Mores NaN NaN Bakery Fresh Goodness, The Kroger Co. bakery-fresh-goodness,the-kroger-co NaN NaN ... NaN NaN NaN NaN 0.04030 0.001438 NaN NaN NaN NaN
2 5599 11150146098 usda-ndb-import Kids Graham Crackers, Chocolate NaN NaN Roundy's roundy-s NaN NaN ... NaN NaN NaN NaN 0.00000 0.001080 NaN NaN NaN NaN
3 6793 11153041949 usda-ndb-import Crunchy Oat Squares With Cinnamon Cereal NaN NaN Foodtown, Foodtown Inc. foodtown,foodtown-inc NaN NaN ... NaN NaN 0.14016 NaN 0.09984 0.007200 NaN NaN NaN NaN
4 9226 11225127700 usda-ndb-import Sweet P's Bake Shop, Jumbo Glazed Raised Donuts NaN NaN Kingston Marketing Co kingston-marketing-co NaN NaN ... NaN NaN NaN NaN 0.08034 0.001076 NaN NaN NaN NaN

5 rows × 56 columns


In [7]:
# Now write it out to disk
outfile = "demo_food_data_sd_regularized_columns.csv"
data.to_csv(outfile)